# eBook: Website Scraping with Python Using BeautifulSoup and Scrapy
# https://link.springer.com/book/10.1007/978-1-4842-3925-4
import os

import bs4


def _new_text_tag(soup, name, text):
    """Create a new *name* tag whose string content is *text*."""
    new_tag = soup.new_tag(name)
    new_tag.string = text
    return new_tag


def _remove_h123_attrs(soup):
    """Normalize the h1/h2/h3 headings found in *soup*.

    The first heading (in document order) is kept as the page title: it is
    renamed to <h1>, or a placeholder <h1>First</h1> is supplied when it has
    no usable text.  Every later heading is flattened: empty ones are
    removed, non-empty ones are unwrapped so only their content remains
    (preceded by a <br> separator when the heading had multiple children).

    Returns the mutated soup.
    """
    for order, tag in enumerate(soup.find_all(['h1', 'h2', 'h3'])):
        if len(tag.contents) == 0:
            # Heading with no content at all.
            if order == 0:
                tag.string = "First"   # first heading: placeholder title
            else:
                tag.extract()          # later empty headings: drop them
        elif len(tag.contents) == 1:
            if tag.get_text() == "":
                # Single non-text child (e.g. an <img>): no title text.
                if order == 0:
                    # BUG FIX: the original soup.new_tag('h1', 'First')
                    # passed 'First' as the *namespace* argument, producing
                    # an empty <h1>.  Build the tag and set .string instead.
                    tag.insert_before(_new_text_tag(soup, 'h1', 'First'))
                else:
                    # unwrap() is the current bs4 name for the deprecated
                    # replaceWithChildren(): strip the tag, keep content.
                    tag.unwrap()
            else:
                # Single plain-text heading: promote the first one to <h1>;
                # leave every later one untouched.
                if order == 0:
                    tag.name = "h1"
        else:
            # Heading with more than one child.
            if order == 0:
                # Keep the heading, but insert the page title before it.
                tag.insert_before(_new_text_tag(soup, 'h1', 'First'))
            else:
                # Separate from the preceding text, then strip the tag.
                tag.insert_before(soup.new_tag('br'))
                tag.unwrap()
    return soup


def file_get_contents(filename):
    """Return the content of *filename* decoded as UTF-8."""
    with open(filename, encoding="utf-8") as file:
        return file.read()


def parse_content():
    """Parse ./content.htm into parallel page lists.

    Creates content.htm with a default page when it is missing or empty,
    normalizes its headings via _remove_h123_attrs(), writes the normalized
    document back, then splits the document text on each h1/h2/h3 tag.

    Returns a tuple (head_list, level_list, page_list) where head_list holds
    the heading texts, level_list the heading level digit ('1'..'3') and
    page_list the body text following each heading.
    """
    config_dir = "./"
    content_path = config_dir + "content.htm"
    default_page = "<h1>head 1</h1>content 1"

    # Ensure a parsable file exists (missing or empty -> default page).
    if not os.path.isfile(content_path):
        with open(content_path, "w", encoding="utf-8") as file:
            file.write(default_page)
    subject = file_get_contents(content_path)
    if subject == "":
        with open(content_path, "w", encoding="utf-8") as file:
            file.write(default_page)
        subject = default_page

    head_list = []
    level_list = []
    page_list = []

    soup = bs4.BeautifulSoup(subject, 'html.parser')
    # Normalize the headings, persist the normalized document, and reload it
    # so the string splits below match the serialized tags exactly.
    soup = _remove_h123_attrs(soup)
    with open(content_path, "wb") as file:
        file.write(soup.encode("utf-8"))
    subject = file_get_contents(content_path)

    htag = soup.find_all(['h1', 'h2', 'h3'])
    n = len(htag)
    if n == 0:
        # Robustness: a pre-existing file with no headings used to raise
        # IndexError on htag[0]; return the empty lists instead.
        return head_list, level_list, page_list

    # Drop everything before (and including) the first heading.
    temp_data = subject.split(str(htag[0]))
    if len(temp_data) > 2:
        # The heading string occurs again later in the body: rejoin the tail.
        subject = str(htag[0]).join(temp_data[1:])
    else:
        subject = temp_data[1]

    # Each split on heading i yields the page body of heading i-1.
    for i in range(1, n):
        head_list.append(htag[i - 1].text.strip())
        # 'h1'/'h2'/'h3' -> the menu level is the digit after 'h'.
        level_list.append(htag[i - 1].name[1])
        temp_data = subject.split(str(htag[i]))
        if len(temp_data) > 2:
            subject = str(htag[i]).join(temp_data[1:])
        else:
            subject = temp_data[1]
        page_list.append(temp_data[0])

    # The remaining text belongs to the last heading.
    head_list.append(htag[n - 1].text.strip())
    level_list.append(htag[n - 1].name[1])
    page_list.append(subject.split(str(htag[n - 1]))[0])

    return head_list, level_list, page_list


if __name__ == "__main__":
    print(parse_content())